In smoking_status, "Unknown" should be changed to NA.
It can also be ordered: never < formerly < smokes.
The other predictors seem to be OK.
# Read the stroke data set; treat both "Unknown" and "N/A" as missing.
# (smoking_status is deliberately read as character: had it been declared
# a factor in col_types, the na = c(...) substitution would not apply.)
df <- read_csv("data/healthcare-dataset-stroke-data.csv", col_types = "cfdfffffddcf", na = c("Unknown", "N/A"))

# Turn smoking_status into a factor whose levels run never < formerly < smokes
df$smoking_status <- df$smoking_status %>%
  as_factor() %>%
  fct_relevel("never smoked", "formerly smoked", "smokes")

# Recode the 0/1 target into a labelled factor with "no" as the first level
df$stroke <- ifelse(df$stroke == 1, "yes", "no") %>%
  factor(levels = c("no", "yes"))
df
Skip the id column:
df$id <- NULL
skimr::skim_to_wide(df)
| Name | Piped data |
| Number of rows | 5110 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1.0 | FALSE | 3 | Fem: 2994, Mal: 2115, Oth: 1 |
| hypertension | 0 | 1.0 | FALSE | 2 | 0: 4612, 1: 498 |
| heart_disease | 0 | 1.0 | FALSE | 2 | 0: 4834, 1: 276 |
| ever_married | 0 | 1.0 | FALSE | 2 | Yes: 3353, No: 1757 |
| work_type | 0 | 1.0 | FALSE | 5 | Pri: 2925, Sel: 819, chi: 687, Gov: 657 |
| Residence_type | 0 | 1.0 | FALSE | 2 | Urb: 2596, Rur: 2514 |
| smoking_status | 1544 | 0.7 | FALSE | 3 | nev: 1892, for: 885, smo: 789 |
| stroke | 0 | 1.0 | FALSE | 2 | no: 4861, yes: 249 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1.00 | 43.23 | 22.61 | 0.08 | 25.00 | 45.00 | 61.00 | 82.00 | ▅▆▇▇▆ |
| avg_glucose_level | 0 | 1.00 | 106.15 | 45.28 | 55.12 | 77.24 | 91.88 | 114.09 | 271.74 | ▇▃▁▁▁ |
| bmi | 201 | 0.96 | 28.89 | 7.85 | 10.30 | 23.50 | 28.10 | 33.10 | 97.60 | ▇▇▁▁▁ |
The target 'stroke' is imbalanced!
Smoking's complete rate is 0.7.
BMI's complete rate is 0.96.
One 'Other' gender record is to be removed.
# Remove the single "Other" gender record
df <- df %>% filter(gender != "Other")

# Pairwise plot matrix, colored by the stroke outcome
GGally::ggpairs(df, aes(color = stroke, alpha = 0.2, dotsize = 0.02),
                upper = list(continuous = GGally::wrap("cor", size = 2.5)),
                diag = list(continuous = "barDiag")) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  scale_fill_brewer(palette = "Set1", direction = -1)

# Count observations per stroke/gender combination
gender <- df %>%
  group_by(stroke, gender) %>%
  summarize(N = n())
# Stacked proportion bars: gender share within each stroke group
ggplot(gender, aes(stroke, N)) +
  geom_bar(aes(fill = gender), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")

Proportions in both stroke groups are roughly the same.
# Age distribution by stroke status; gender overlaid as jittered points
ggplot(df, aes(stroke, age)) +
  geom_violin(alpha = 0.3) +
  geom_jitter(aes(color = gender), alpha = 0.2, size = 0.8, width = 0.15, height = 0.1) +
  geom_boxplot(alpha = 0.2) +
  scale_color_brewer(palette = "Set2", direction = -1)
On average, age is higher in the stroke-yes group, and there is no
apparent interaction between age and gender.
# Tabulate hypertension counts within each stroke group
hyptens <- df %>%
  group_by(stroke, hypertension) %>%
  summarize(N = n())
hyptens

# Stacked proportion bars: hypertension share within each stroke group
ggplot(hyptens, aes(stroke, N)) +
  geom_bar(aes(fill = hypertension), alpha = 0.8, stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ylab("proportion")
Hypertension occurs more often in the stroke-yes group.
Scale the numeric features:
df_scaled <- df %>% select(avg_glucose_level, age, bmi) %>% scale() %>% data.frame()
I've decided to omit smoking_status completely - it won't be dummified.
# One-hot encode the nominal predictors with caret::dummyVars:
# pick the columns, build the encoder, then apply it.
to_dum <- df %>% select(gender, ever_married, work_type, Residence_type)
dummies <- dummyVars(~ ., data = to_dum)
df_dummy <- data.frame(predict(dummies, newdata = to_dum))
head(df_dummy)df_proc <- bind_cols(df_scaled, df_dummy, select(df, hypertension, heart_disease, stroke))
df_proc
I will use median imputation for BMI.
Smoking status will not be processed.
df_proc <- df_proc %>%
  mutate(bmi = ifelse(is.na(bmi), median(bmi, na.rm = TRUE), bmi))
ROC-optimization is better when the data is imbalanced.
# Cross-validation control for ROC-optimized training: repeated 10-fold CV.
# classProbs = TRUE plus twoClassSummary are both required so caret can
# report ROC/Sens/Spec instead of Accuracy/Kappa.
fit_ctrl_xgb_roc <- trainControl(method = "repeatedcv",
                                 number = 10,
                                 repeats = 10,
                                 allowParallel = TRUE,  # TRUE, not T: T is a reassignable binding
                                 classProbs = TRUE,
                                 summaryFunction = twoClassSummary)

# Accuracy-optimized counterpart (caret's default summary function)
fit_ctrl_xgb_acc <- trainControl(method = "repeatedcv",
                                 number = 10,
                                 repeats = 10,
                                 allowParallel = TRUE)

Imbalanced data - use SMOTE to create the training data set, but not the testing data set.
# Reproducible, stratified 75/25 train/test split on the stroke label
set.seed(1234)
sample_set <- createDataPartition(y = df_proc$stroke, p = .75, list = FALSE)
df_train <- df_proc[sample_set, ]
df_test <- df_proc[-sample_set, ]

# Rebalance ONLY the training set with DMwR::SMOTE;
# the test set keeps the natural class distribution
df_train <- SMOTE(stroke ~ ., data.frame(df_train), perc.over = 100, perc.under = 200)

set.seed(123)
# Random forest tuned to maximize ROC under repeated 10-fold CV
# (uses the classProbs-enabled control object defined above)
fit_rf_roc <- train(stroke ~ .,
data = df_train,
metric = "ROC",
method = "rf",
trControl = fit_ctrl_xgb_roc,
verbosity = 0,
verbose = FALSE)
fit_rf_roc
## Random Forest
##
## 748 samples
## 17 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 672, 672, 674, 674, 674, 674, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9155172 0.8218706 0.8334708
## 9 0.9117226 0.8103414 0.8366714
## 17 0.9012339 0.8065932 0.8265007
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Random forest tuned to maximize Accuracy (caret default summary),
# fitted for comparison against the ROC-optimized model
set.seed(123)
fit_rf_acc <- train(stroke ~ .,
data = df_train,
metric = "Accuracy",
method = "rf",
trControl = fit_ctrl_xgb_acc,
verbosity = 0,
verbose = FALSE)
fit_rf_acc
## Random Forest
##
## 748 samples
## 17 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 672, 672, 674, 674, 674, 674, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8276459 0.6553001
## 9 0.8234948 0.6469962
## 17 0.8165376 0.6330766
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
imp_vars_rf <- varImp(fit_rf_roc)
plot(imp_vars_rf, main="Variable Importance with RF")
ROC-optimized model:
# Evaluate the ROC-optimized RF on the held-out (non-SMOTE'd) test set;
# "yes" (stroke) is treated as the positive class
predClasses_rf_roc <- predict(fit_rf_roc, newdata=df_test)
cm_rf_roc <- confusionMatrix(data = predClasses_rf_roc,
reference = df_test$stroke,
mode="everything",
positive="yes")
cm_rf_roc
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 958 16
## yes 257 46
##
## Accuracy : 0.7862
## 95% CI : (0.7627, 0.8084)
## No Information Rate : 0.9514
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1865
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.74194
## Specificity : 0.78848
## Pos Pred Value : 0.15182
## Neg Pred Value : 0.98357
## Precision : 0.15182
## Recall : 0.74194
## F1 : 0.25205
## Prevalence : 0.04855
## Detection Rate : 0.03602
## Detection Prevalence : 0.23727
## Balanced Accuracy : 0.76521
##
## 'Positive' Class : yes
##
# Gradient-boosted trees (xgbDART) tuned to maximize ROC,
# using the same repeated-CV control as the random forest
set.seed(123)
fit_xgb_roc <- train(stroke ~ .,
data = df_train,
method = "xgbDART",
metric = "ROC",
trControl = fit_ctrl_xgb_roc,
verbose = FALSE,
verbosity = 0)
fit_xgb_roc$bestTune
Show metrics:
# CV metrics for the selected hyper-parameter combination
fit_xgb_roc$results %>%
  filter(nrounds == 50, max_depth == 2, eta == 0.3, gamma == 0,
         subsample == 1, rate_drop == 0.5, colsample_bytree == 0.6,
         skip_drop == 0.95)

# Variable importance for the boosted model
imp_vars_xgb <- varImp(fit_xgb_roc)
plot(imp_vars_xgb, main = "Variable Importance with XGB")

# Evaluate the ROC-optimized XGB on the held-out test set
predClasses_xgb_roc <- predict(fit_xgb_roc, newdata = df_test)
cm_xgb_roc <- confusionMatrix(data = predClasses_xgb_roc,
                              reference = df_test$stroke,
                              mode = "everything",
                              positive = "yes")
cm_xgb_roc
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 921 16
## yes 294 46
##
## Accuracy : 0.7572
## 95% CI : (0.7328, 0.7805)
## No Information Rate : 0.9514
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1599
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.74194
## Specificity : 0.75802
## Pos Pred Value : 0.13529
## Neg Pred Value : 0.98292
## Precision : 0.13529
## Recall : 0.74194
## F1 : 0.22886
## Prevalence : 0.04855
## Detection Rate : 0.03602
## Detection Prevalence : 0.26625
## Balanced Accuracy : 0.74998
##
## 'Positive' Class : yes
##
# Build ROC-curve objects for a fitted caret model.
#
# fit.obj    - caret `train` object fitted with classProbs = TRUE
# testing.df - held-out data containing the true `stroke` labels
#
# Returns a list: [[1]] ROCR performance object (tpr vs fpr, for plotting),
#                 [[2]] ROCR prediction object (for AUC extraction).
get_roc <- function(fit.obj, testing.df) {
  prob_yes <- predict.train(fit.obj, newdata = testing.df, type = "prob")$yes
  rocr_pred <- prediction(predictions = prob_yes, labels = testing.df$stroke)
  rocr_perf <- performance(rocr_pred, measure = "tpr", x.measure = "fpr")
  list(rocr_perf, rocr_pred)
}

# calculate ROC
# ROC curve and AUC for the random forest on the test set
perf_pred <- get_roc(fit_rf_roc, df_test)
perf_roc_rf <- perf_pred[[1]]
pred_roc_rf <- perf_pred[[2]]

# Extract the AUC from the ROCR prediction object
auc_rf <- round(unlist(slot(performance(pred_roc_rf, measure = "auc"), "y.values")), 3)

plot(perf_roc_rf, main = "RF ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)  # chance diagonal
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_rf))

# calculate ROC
# ROC curve and AUC for the boosted model on the test set
perf_pred <- get_roc(fit_xgb_roc, df_test)
perf_roc_xgb <- perf_pred[[1]]
pred_roc_xgb <- perf_pred[[2]]

# Extract the AUC from the ROCR prediction object
auc_xgb <- round(unlist(slot(performance(pred_roc_xgb, measure = "auc"), "y.values")), 3)

plot(perf_roc_xgb, main = "XGB ROC curve", col = "steelblue", lwd = 3)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)  # chance diagonal
legend(x = 0.7, y = 0.3, legend = paste0("AUC = ", auc_xgb))

# Persist the full workspace
save.image("workspace.RData")